library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)

Research Focus:

As COVID-19 spreads at an alarming rate, a pressing question emerges at a global scale: what characteristics of a country contribute to the spread of the coronavirus? We hope to analyze how a country’s population, population density, and continent relate to the spread of COVID-19.

Data Access

Reading in the Data:

Data Source 1: COVID

COVID <- read.csv(file = "total-covid-cases-deaths-per-million.csv")
COVID
COVID %>%
  nrow()
[1] 9487
COVID %>%
  names()
  [1] "total.covid.cases.deaths.per.million" "X"                                    "X.1"                                 
  [4] "X.2"                                  "X.3"                                  "X.4"                                 
  [7] "X.5"                                  "X.6"                                  "X.7"                                 
 [10] "X.8"                                  "X.9"                                  "X.10"                                
 [13] "X.11"                                 "X.12"                                 "X.13"                                
 [16] "X.14"                                 "X.15"                                 "X.16"                                
 [19] "X.17"                                 "X.18"                                 "X.19"                                
 [22] "X.20"                                 "X.21"                                 "X.22"                                
 [25] "X.23"                                 "X.24"                                 "X.25"                                
 [28] "X.26"                                 "X.27"                                 "X.28"                                
 [31] "X.29"                                 "X.30"                                 "X.31"                                
 [34] "X.32"                                 "X.33"                                 "X.34"                                
 [37] "X.35"                                 "X.36"                                 "X.37"                                
 [40] "X.38"                                 "X.39"                                 "X.40"                                
 [43] "X.41"                                 "X.42"                                 "X.43"                                
 [46] "X.44"                                 "X.45"                                 "X.46"                                
 [49] "X.47"                                 "X.48"                                 "X.49"                                
 [52] "X.50"                                 "X.51"                                 "X.52"                                
 [55] "X.53"                                 "X.54"                                 "X.55"                                
 [58] "X.56"                                 "X.57"                                 "X.58"                                
 [61] "X.59"                                 "X.60"                                 "X.61"                                
 [64] "X.62"                                 "X.63"                                 "X.64"                                
 [67] "X.65"                                 "X.66"                                 "X.67"                                
 [70] "X.68"                                 "X.69"                                 "X.70"                                
 [73] "X.71"                                 "X.72"                                 "X.73"                                
 [76] "X.74"                                 "X.75"                                 "X.76"                                
 [79] "X.77"                                 "X.78"                                 "X.79"                                
 [82] "X.80"                                 "X.81"                                 "X.82"                                
 [85] "X.83"                                 "X.84"                                 "X.85"                                
 [88] "X.86"                                 "X.87"                                 "X.88"                                
 [91] "X.89"                                 "X.90"                                 "X.91"                                
 [94] "X.92"                                 "X.93"                                 "X.94"                                
 [97] "X.95"                                 "X.96"                                 "X.97"                                
[100] "X.98"                                 "X.99"                                 "X.100"                               
[103] "X.101"                                "X.102"                                "X.103"                               
[106] "X.104"                                "X.105"                                "X.106"                               
[109] "X.107"                                "X.108"                                "X.109"                               
[112] "X.110"                                "X.111"                                "X.112"                               
[115] "X.113"                                "X.114"                                "X.115"                               
[118] "X.116"                                "X.117"                                "X.118"                               
[121] "X.119"                                "X.120"                                "X.121"                               
[124] "X.122"                                "X.123"                                "X.124"                               
[127] "X.125"                                "X.126"                                "X.127"                               
[130] "X.128"                                "X.129"                                "X.130"                               
[133] "X.131"                                "X.132"                                "X.133"                               
[136] "X.134"                                "X.135"                                "X.136"                               
[139] "X.137"                                "X.138"                                "X.139"                               
[142] "X.140"                                "X.141"                                "X.142"                               
[145] "X.143"                                "X.144"                                "X.145"                               
[148] "X.146"                                "X.147"                                "X.148"                               
[151] "X.149"                                "X.150"                                "X.151"                               
[154] "X.152"                                "X.153"                                "X.154"                               
[157] "X.155"                                "X.156"                                "X.157"                               
[160] "X.158"                                "X.159"                                "X.160"                               
[163] "X.161"                                "X.162"                                "X.163"                               
[166] "X.164"                                "X.165"                                "X.166"                               
[169] "X.167"                                "X.168"                                "X.169"                               
[172] "X.170"                                "X.171"                                "X.172"                               
[175] "X.173"                                "X.174"                                "X.175"                               
[178] "X.176"                                "X.177"                                "X.178"                               
[181] "X.179"                                "X.180"                                "X.181"                               
[184] "X.182"                                "X.183"                                "X.184"                               
[187] "X.185"                                "X.186"                                "X.187"                               
[190] "X.188"                                "X.189"                                "X.190"                               
[193] "X.191"                                "X.192"                                "X.193"                               
[196] "X.194"                                "X.195"                                "X.196"                               
[199] "X.197"                                "X.198"                                "X.199"                               
[202] "X.200"                                "X.201"                                "X.202"                               
[205] "X.203"                                "X.204"                                "X.205"                               
[208] "X.206"                                "X.207"                                "X.208"                               
[211] "X.209"                                "X.210"                                "X.211"                               
[214] "X.212"                                "X.213"                                "X.214"                               
[217] "X.215"                                "X.216"                                "X.217"                               
[220] "X.218"                                "X.219"                                "X.220"                               
[223] "X.221"                                "X.222"                                "X.223"                               
[226] "X.224"                                "X.225"                                "X.226"                               
[229] "X.227"                                "X.228"                                "X.229"                               
[232] "X.230"                                "X.231"                                "X.232"                               
[235] "X.233"                                "X.234"                                "X.235"                               
[238] "X.236"                                "X.237"                                "X.238"                               
[241] "X.239"                                "X.240"                                "X.241"                               
[244] "X.242"                                "X.243"                                "X.244"                               
[247] "X.245"                                "X.246"                                "X.247"                               
[250] "X.248"                                "X.249"                                "X.250"                               
[253] "X.251"                                "X.252"                                "X.253"                               
[256] "X.254"                               
COVID %>%
  head()

Data Source 2: CountryData

CountryData
CountryData %>%
  nrow()
[1] 256
CountryData %>%
  names()
 [1] "country"           "area"              "pop"               "growth"            "birth"             "death"            
 [7] "migr"              "maternal"          "infant"            "life"              "fert"              "health"           
[13] "HIVrate"           "HIVpeople"         "HIVdeath"          "obesity"           "underweight"       "educ"             
[19] "unemploymentYouth" "GDP"               "GDPgrowth"         "GDPcapita"         "saving"            "indProd"          
[25] "labor"             "unemployment"      "family"            "tax"               "budget"            "debt"             
[31] "inflation"         "discount"          "lending"           "narrow"            "broad"             "credit"           
[37] "shares"            "balance"           "exports"           "imports"           "gold"              "externalDebt"     
[43] "homeStock"         "abroadStock"       "elecProd"          "elecCons"          "elecExp"           "elecImp"          
[49] "elecCap"           "elecFossil"        "elecNuc"           "elecHydro"         "elecRenew"         "oilProd"          
[55] "oilExp"            "oilImp"            "oilRes"            "petroProd"         "petroCons"         "petroExp"         
[61] "petroImp"          "gasProd"           "gasCons"           "gasExp"            "gasImp"            "gasRes"           
[67] "mainlines"         "cell"              "netHosts"          "netUsers"          "airports"          "railways"         
[73] "roadways"          "waterways"         "marine"            "military"         
CountryData %>%
  head()

Data Source 3: countryRegions

countryRegions
countryRegions %>%
  nrow()
[1] 254
countryRegions %>%
  names()
 [1] "ISO3"         "ADMIN"        "REGION"       "continent"    "GEO3major"    "GEO3"         "IMAGE24"      "GLOCAF"      
 [9] "Stern"        "SRESmajor"    "SRES"         "GBD"          "AVOIDnumeric" "AVOIDname"    "LDC"          "SID"         
[17] "LLDC"        
countryRegions %>%
  head()

Data Wrangling

Tidying the COVID Dataset

COVID

Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.

# Tidy the raw COVID table into one row per country and date, keeping the
# country name, country code, date and cases-per-million figure.
# The first row is dropped; presumably it holds the CSV's real header, since
# the imported column names are the generic X, X.1, ... (TODO confirm).
TidyCOVID <- COVID %>%
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    date = X.1,
    casesPerMillion = X.3
  ) %>%
  filter(row_number() > 1) %>%
  select(country, Code, date, casesPerMillion) %>%
  mutate(
    country = as.character(country),
    Code = as.character(Code),
    date = mdy(as.character(date)),
    # If read.csv imported this column as a factor, as.integer() returns the
    # level codes rather than the numbers, so go through character first.
    casesPerMillion = as.numeric(as.character(casesPerMillion)),
    # Blank entries parse to NA; count them as zero cases so later sums stay
    # defined (NOTE(review): confirm blanks mean "no cases reported yet").
    casesPerMillion = coalesce(casesPerMillion, 0)
  )
TidyCOVID

EVELYN pls explain what an instance represents

Wrangling of countryRegions Dataset

We will extract the ISO3 country code and continent from the countryRegions data. Because country naming conventions vary between sources, the ISO3 country code gives us a standardized identifier with which to join other data tables.

Labels <-
  countryRegions %>%
  subset(select = c("ISO3", "REGION")) %>%
  rename(continent = REGION)
Labels

Data Extraction of CountryData Dataset

We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).

RelevantCountryData <-
  CountryData %>%
  subset(select = c(1,2,3)) %>%
  mutate(popdensity = pop/area)
RelevantCountryData

Joining Data & Relevant Variable Synthesis

Calculate the number of cases in each country by multiplying casesPerMillion by the country’s population (in millions).

# Attach area/population to every country-date row, derive the absolute case
# count, then add the continent label via the ISO3 country code.
COVIDGrowth <-
  inner_join(TidyCOVID, RelevantCountryData, by = "country") %>%
  # casesPerMillion is a rate per 1,000,000 people, so scale by the exact
  # population. Rounding pop / 1e6 to a whole number first would zero out
  # every country with fewer than 500,000 inhabitants.
  mutate(cases = casesPerMillion * pop / 1000000) %>%
  # Compare country codes as character on both sides so the join does not
  # have to coerce a factor.
  mutate(Code = as.character(Code)) %>%
  left_join(
    mutate(Labels, ISO3 = as.character(ISO3)),
    by = c("Code" = "ISO3")
  )
Column `Code`/`ISO3` joining factor and character vector, coercing into character vector
COVIDGrowth

Creation of new Data Table: FirstInstance

This table records the first date on which each country recorded a nonzero number of COVID-19 cases. It will help us visualize when countries first became infected.

FirstInstance <-
  COVIDGrowth %>%
  filter(cases != 0) %>%
  group_by(country, continent) %>%
  summarise(beginningofspread = min(date))
  
FirstInstance

This table computes each country's average increase in cases per day, from the first day the country had COVID-19 to the most recent date in the data table (April 5, 2020).

# Average growth per day for every country: the case count (absolute and per
# million) on 2020-04-05 divided by the number of days since the country's
# first nonzero count.
DailySpread <-
  left_join(COVIDGrowth, FirstInstance, by = c("country")) %>%
  filter(date == as.Date("2020-04-05")) %>%
  # A country whose first case falls on the snapshot date has zero elapsed
  # days; count that as one day so the division cannot produce Inf.
  mutate(dayselapsed = pmax(as.numeric(date - beginningofspread), 1)) %>%
  mutate(
    dailyspread = cases / dayselapsed,
    dailyspreadpermillion = casesPerMillion / dayselapsed
  ) %>%
  # Countries without a first-case date get NA rates; report them as zero.
  mutate(
    dailyspread = coalesce(dailyspread, 0),
    dailyspreadpermillion = coalesce(dailyspreadpermillion, 0)
  ) %>%
  select(country, beginningofspread, dailyspread, dailyspreadpermillion)
DailySpread
COVIDFinal <-
  left_join(COVIDGrowth, DailySpread, by = c("country"))
COVIDFinal

Data Visualization

Overall Growth of COVID-19 Over Time

COVIDFinal %>%
  group_by(date) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(x = date, y = totalcases)) + 
  geom_point() +
  xlab("Date") +
  ylab("COVID-19 Cases")

Continental Growth of COVID-19 Over Time

na.omit(COVIDFinal) %>%
  group_by(date, continent) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(x = date, y = totalcases)) + 
  geom_point() +
  facet_wrap(~continent) +
  xlab("Date") +
  ylab("COVID-19 Cases")

Infection of COVID-19 into countries over time

na.omit(FirstInstance) %>%
  ggplot(aes(x = beginningofspread, fill = continent)) +
  geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions="all") +
  xlab("Country's First Case of COVID-19") +
  theme(panel.background = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank())

Which countries have the highest infection rates?

  
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspread = mean(dailyspread)) %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, desc(dailyspread)), y= dailyspread)) +
  geom_bar(stat="identity", position = 'stack', width=.9) +
  theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Average Number Infected Per Day") +
  theme(axis.title.x = element_blank())

Compare this to which countries have the highest populations

COVIDFinal %>%
  group_by(country) %>%
  summarise(pop = mean(pop)) %>%
  arrange(desc(pop)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, desc(pop)), y= pop)) +
  geom_bar(stat="identity", position = 'stack', width=.9) +
  theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population") +
  theme(axis.title.x = element_blank())

Let’s visualize the relationship between population and COVID-19 spread on the same data frame… with an awareness of the continental distribution

na.omit(COVIDFinal) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) + 
  geom_point() +
  xlab("Population of Country") +
  ylab("Average Number Infected Per Day")

Does the relationship hold up after removing the largest outliers (China and India)?

Does the positive relationship hold up across all continents?

na.omit(COVIDFinal) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) + 
  geom_point() +
  xlim(0,500000000) +
  ylim(0, 40000) +
  xlab("Population of Country") +
  ylab("Average Number Infected Per Day") +
  stat_smooth(method = lm) 

A prevailing explanation for the spread of COVID-19 is social closeness; we therefore hypothesize that countries with the highest population density will have the highest proportional rates of infection. To measure proportional rates of infection, it is essential to use a standardized metric, so that the data are not skewed towards the countries that simply have the most people. We will therefore analyze the variable “population per million infected per day”, which represents the share of a country’s population that is infected each day. If our hypothesis is correct, the countries with the highest population per million infected per day will be those with the highest population density.

Which countries have the highest infection rates per million?

  
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>%
  arrange(desc(dailyspreadpermillion)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, desc(dailyspreadpermillion)), y= dailyspreadpermillion)) +
  geom_bar(stat="identity", position = 'stack', width=.9) +
  theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population Per Million Infected Per Day") +
  theme(axis.title.x = element_blank())

Which countries have the highest population density?

  
COVIDFinal %>%
  group_by(country) %>%
  summarise(popdensity = mean(popdensity)) %>%
  arrange(desc(popdensity)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, desc(popdensity)), y= popdensity)) +
  geom_bar(stat="identity", position = 'stack', width=.9) +
  theme(axis.text.x=element_text(angle = 60, hjust = 1)) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population Density (people/sq km)") +
  theme(axis.title.x = element_blank())

Is there a visible correlation between these attributes?

na.omit(COVIDFinal) %>%
  ggplot(aes(x = popdensity, y = dailyspreadpermillion)) +
  geom_point() 

What if faceted by continent?

na.omit(COVIDFinal) %>%
  ggplot(aes(x = popdensity, y = dailyspreadpermillion)) +
  geom_point() + 
  facet_wrap(~continent) + 
  xlim(0,1500)

Conclusion

evelyn pls write a conclusion here… something about there being a correlation btwn population and spread, but once standardized, the correlation is far less evident… we can not prove a correlation between population density and infection rate/million.

also i defined this function (because we need a user defined function and to use wide/narrow form), but unsure exactly where to put it,, lmk if u think of a good place.

Country Comparison Function

Easy to Traverse– Wide Countries

WideCountries <-
  COVIDFinal %>%
  subset(select = c("country", "date", "cases")) %>%
  spread(key = date, value = cases)
WideCountries[is.na(WideCountries)] <- 0
WideCountries

compareCOVID() definition

#' Plot the COVID-19 case curves of two countries against each other.
#'
#' Reads the wide `WideCountries` table (one row per country, one column per
#' date), reshapes the two requested rows back to narrow form and draws a
#' loess-smoothed curve of case counts over time for each country.
#'
#' @param countryA Name of the first country, spelled as in
#'   `WideCountries$country`.
#' @param countryB Name of the second country, spelled the same way.
#' @return A ggplot object with date on the x axis, case count on the y axis
#'   and one coloured curve per country.
compareCOVID <- function(countryA, countryB) {
  # A name that matches no row would otherwise simply be absent from the
  # plot; warn so the caller can see why a curve is missing.
  missing_countries <- setdiff(c(countryA, countryB), WideCountries$country)
  if (length(missing_countries) > 0) {
    warning(
      "No rows in WideCountries for: ",
      paste(missing_countries, collapse = ", "),
      call. = FALSE
    )
  }

  # Wide -> narrow for one country. Excluding `country` from the gather keeps
  # the counts out of a mixed character column and avoids dropping the name
  # row by position afterwards.
  narrow_one <- function(name) {
    WideCountries %>%
      filter(country == name) %>%
      gather(key = date, value = count, -country) %>%
      mutate(
        date = lubridate::ymd(date),
        count = as.numeric(count)
      )
  }

  GG <- bind_rows(narrow_one(countryA), narrow_one(countryB))

  ggplot(GG, aes(x = date, y = count, color = country)) +
    stat_smooth(formula = y ~ x, method = "loess") +
    ylab("Number of COVID-19 Cases") +
    xlab("Date")
}

Ex. of compareCOVID() in use:

compareCOVID("China", "United States")

compareCOVID("Japan", "Russia")

compareCOVID("Puerto Rico", "Belgium")

---
title: "Final Project"
output: html_notebook
authors: "Joseph Pevner and Evelyn Murray"
---

```{r}
library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
```

## Research Focus:

As COVID-19 spreads at an alarming rate, a pressing question emerges at a global scale: what characteristics of a country contribute to the spread of the coronavirus? We hope to analyze how a country's population, population density, and continent relate to the spread of COVID-19.



## Data Access

### Reading in the Data:


#### Data Source 1: COVID
```{r}
# Read the raw COVID-19 table from the working directory and print it.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
COVID
```

```{r}
# Row count of the raw table.
nrow(COVID)
```
```{r}
# Column names as imported.
names(COVID)
```
```{r}
# First rows, for a quick look at the layout.
head(COVID)
```



#### Data Source 2: CountryData
```{r}
# Print the country-level reference table.
CountryData
```

```{r}
# Row count.
nrow(CountryData)
```
```{r}
# Available attributes.
names(CountryData)
```
```{r}
# First rows.
head(CountryData)
```

#### Data Source 3: countryRegions

```{r}
# Print the country-to-region classification table.
countryRegions
```
```{r}
# Row count.
nrow(countryRegions)
```
```{r}
# Available classification columns.
names(countryRegions)
```
```{r}
# First rows.
head(countryRegions)
```




## Data Wrangling

### Tidying the COVID Dataset

```{r}
COVID
```

Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.

```{r}
# Tidy the raw COVID table into one row per country and date, keeping the
# country name, country code, date and cases-per-million figure.
# The first row is dropped; presumably it holds the CSV's real header, since
# the imported column names are the generic X, X.1, ... (TODO confirm).
TidyCOVID <- COVID %>%
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    date = X.1,
    casesPerMillion = X.3
  ) %>%
  filter(row_number() > 1) %>%
  select(country, Code, date, casesPerMillion) %>%
  mutate(
    country = as.character(country),
    Code = as.character(Code),
    date = mdy(as.character(date)),
    # If read.csv imported this column as a factor, as.integer() returns the
    # level codes rather than the numbers, so go through character first.
    casesPerMillion = as.numeric(as.character(casesPerMillion)),
    # Blank entries parse to NA; count them as zero cases so later sums stay
    # defined (NOTE(review): confirm blanks mean "no cases reported yet").
    casesPerMillion = coalesce(casesPerMillion, 0)
  )


```


```{r}
TidyCOVID

```



EVELYN pls explain what an instance represents


### Wrangling of countryRegions Dataset

We will extract the ISO3 country code and continent from the countryRegions data. Because country naming conventions vary between sources, the ISO3 country code gives us a standardized identifier with which to join other data tables.

```{r}
# Lookup table from ISO3 country code to continent; the continent label is
# taken from the REGION column of countryRegions.
Labels <- rename(
  subset(countryRegions, select = c("ISO3", "REGION")),
  continent = REGION
)

Labels

```


### Data Extraction of CountryData Dataset

We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).

```{r}

# Keep the first three columns of CountryData (country, area, pop per the
# names() listing) and derive population density as people per unit area.
RelevantCountryData <- mutate(
  subset(CountryData, select = 1:3),
  popdensity = pop / area
)

RelevantCountryData
```

### Joining Data & Relevant Variable Synthesis

Calculate the number of cases in each country by multiplying casesPerMillion by the country's population (in millions). 
```{r}

# Attach area/population to every country-date row, derive the absolute case
# count, then add the continent label via the ISO3 country code.
COVIDGrowth <-
  inner_join(TidyCOVID, RelevantCountryData, by = "country") %>%
  # casesPerMillion is a rate per 1,000,000 people, so scale by the exact
  # population. Rounding pop / 1e6 to a whole number first would zero out
  # every country with fewer than 500,000 inhabitants.
  mutate(cases = casesPerMillion * pop / 1000000) %>%
  # Compare country codes as character on both sides so the join does not
  # have to coerce a factor.
  mutate(Code = as.character(Code)) %>%
  left_join(
    mutate(Labels, ISO3 = as.character(ISO3)),
    by = c("Code" = "ISO3")
  )

COVIDGrowth
```

### Creation of new Data Table: FirstInstance

This table records the first date on which each country recorded a nonzero number of COVID-19 cases. It will help us visualize when countries first became infected.
```{r}

# Earliest date with a nonzero case count for each country/continent pair.
FirstInstance <- summarise(
  group_by(filter(COVIDGrowth, cases != 0), country, continent),
  beginningofspread = min(date)
)

FirstInstance


```




This table computes each country's average increase in cases per day, from the first day the country had COVID-19 to the most recent date in the data table (April 5, 2020).

```{r}

# Average growth per day for every country: the case count (absolute and per
# million) on 2020-04-05 divided by the number of days since the country's
# first nonzero count.
DailySpread <-
  left_join(COVIDGrowth, FirstInstance, by = c("country")) %>%
  filter(date == as.Date("2020-04-05")) %>%
  # A country whose first case falls on the snapshot date has zero elapsed
  # days; count that as one day so the division cannot produce Inf.
  mutate(dayselapsed = pmax(as.numeric(date - beginningofspread), 1)) %>%
  mutate(
    dailyspread = cases / dayselapsed,
    dailyspreadpermillion = casesPerMillion / dayselapsed
  ) %>%
  # Countries without a first-case date get NA rates; report them as zero.
  mutate(
    dailyspread = coalesce(dailyspread, 0),
    dailyspreadpermillion = coalesce(dailyspreadpermillion, 0)
  ) %>%
  select(country, beginningofspread, dailyspread, dailyspreadpermillion)

DailySpread
```



```{r}

# Add the per-country spread summary to every country-date row.
COVIDFinal <- COVIDGrowth %>%
  left_join(DailySpread, by = "country")


```



```{r}
COVIDFinal

```








## Data Visualization


### Overall Growth of COVID-19 Over Time
```{r}

# Total cases across all countries for each date, drawn as points over time.
ggplot(
  summarise(group_by(COVIDFinal, date), totalcases = sum(cases)),
  aes(x = date, y = totalcases)
) +
  geom_point() +
  labs(x = "Date", y = "COVID-19 Cases")

```




### Continental Growth of COVID-19 Over Time

```{r}

# Total cases per continent and date, one panel per continent.
# Only rows lacking a continent or a case count are excluded. Dropping every
# row with an NA in any column (as na.omit() does) would also remove
# countries whose only gap is an unrelated field such as area, and so
# under-count the continent totals.
COVIDFinal %>%
  filter(!is.na(continent), !is.na(cases)) %>%
  group_by(date, continent) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(x = date, y = totalcases)) +
  geom_point() +
  facet_wrap(~continent) +
  xlab("Date") +
  ylab("COVID-19 Cases")
```


### Infection of COVID-19 into countries over time
```{r}

# Stacked dot plot of first-case dates, one dot per country, filled by
# continent. The y axis carries no information, so its decorations are hidden.
ggplot(
  na.omit(FirstInstance),
  aes(x = beginningofspread, fill = continent)
) +
  geom_dotplot(stackgroups = TRUE, binwidth = 1, binpositions = "all") +
  labs(x = "Country's First Case of COVID-19") +
  theme(
    panel.background = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  )
```





### Which countries have the highest infection rates?


```{r}
  
# Bar chart of the 20 countries with the highest average daily case growth,
# ordered from highest to lowest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspread = mean(dailyspread)) %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -dailyspread), y = dailyspread)) +
  geom_col(width = 0.9) +
  # Plain digits on the axis instead of scientific notation.
  scale_y_continuous(labels = function(v) format(v, scientific = FALSE)) +
  labs(y = "Average Number Infected Per Day") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )



```

### Compare this to which countries have the highest populations

```{r}

# Bar chart of the 20 most populous countries, ordered from highest to lowest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(pop = mean(pop)) %>%
  arrange(desc(pop)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -pop), y = pop)) +
  geom_col(width = 0.9) +
  # Plain digits on the axis instead of scientific notation.
  scale_y_continuous(labels = function(v) format(v, scientific = FALSE)) +
  labs(y = "Population") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )


```


### Let's visualize the relationship between population and COVID-19 spread on the same data frame... with an awareness of the continental distribution


```{r}

# Population against average daily case growth, coloured by continent.
# COVIDFinal has one row per country and date, but pop and dailyspread are
# constant within a country, so reduce to one point per country instead of
# over-plotting the same point once per date. Only the columns this plot
# uses are checked for NA, so a gap in an unrelated field does not remove a
# country.
COVIDFinal %>%
  filter(!is.na(continent), !is.na(pop), !is.na(dailyspread)) %>%
  distinct(country, continent, pop, dailyspread) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) +
  geom_point() +
  xlab("Population of Country") +
  ylab("Average Number Infected Per Day")



```



### Does the relationship hold up after removing the largest outliers (China and India)?
### Does the positive relationship hold up across all continents?
```{r}

# Same scatter restricted to populations up to 500 million and daily growth
# up to 40,000, with a linear fit per continent.
# One row per country is kept before fitting. Leaving one row per country and
# date would weight each country by its number of dated rows and make the
# confidence bands far too narrow. Only the columns used here are checked
# for NA.
COVIDFinal %>%
  filter(!is.na(continent), !is.na(pop), !is.na(dailyspread)) %>%
  distinct(country, continent, pop, dailyspread) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) +
  geom_point() +
  xlim(0, 500000000) +
  ylim(0, 40000) +
  xlab("Population of Country") +
  ylab("Average Number Infected Per Day") +
  stat_smooth(method = "lm", formula = y ~ x)



```



### A prevailing explanation for the spread of COVID-19 is social closeness; we therefore hypothesize that countries with the highest population density will have the highest proportional rates of infection. To measure proportional rates of infection, it is essential to use a standardized metric, so that the data are not skewed towards the countries that simply have the most people. We will therefore analyze the variable "population per million infected per day", which represents the share of a country's population that is infected each day. If our hypothesis is correct, the countries with the highest population per million infected per day will be those with the highest population density.


### Which countries have the highest infection rates per million?


```{r}
  
# Bar chart of the 20 countries with the highest average daily growth in
# cases per million, ordered from highest to lowest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>%
  arrange(desc(dailyspreadpermillion)) %>%
  head(20) %>%
  ggplot(
    aes(
      x = reorder(country, -dailyspreadpermillion),
      y = dailyspreadpermillion
    )
  ) +
  geom_col(width = 0.9) +
  # Plain digits on the axis instead of scientific notation.
  scale_y_continuous(labels = function(v) format(v, scientific = FALSE)) +
  labs(y = "Population Per Million Infected Per Day") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )



```

### Which countries have the highest population density?

```{r}
  
# Bar chart of the 20 most densely populated countries, ordered from highest
# to lowest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(popdensity = mean(popdensity)) %>%
  arrange(desc(popdensity)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -popdensity), y = popdensity)) +
  geom_col(width = 0.9) +
  # Plain digits on the axis instead of scientific notation.
  scale_y_continuous(labels = function(v) format(v, scientific = FALSE)) +
  labs(y = "Population Density (people/sq km)") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )



```




### Is there a visible correlation between these attributes?


```{r}
# Population density against daily growth per million, complete rows only.
ggplot(
  na.omit(COVIDFinal),
  aes(x = popdensity, y = dailyspreadpermillion)
) +
  geom_point()
```


### What if faceted by continent?

```{r}
# Same scatter split into one panel per continent, with the density axis
# limited to 0-1500.
ggplot(
  na.omit(COVIDFinal),
  aes(x = popdensity, y = dailyspreadpermillion)
) +
  geom_point() +
  facet_wrap(~continent) +
  xlim(0, 1500)

```






## Conclusion



evelyn pls write a conclusion here... something about there being a correlation btwn population and spread, but once standardized, the correlation is far less evident... we can not prove a correlation between population density and infection rate/million.


also i defined this function (because we need a user defined function and to use wide/narrow form), but unsure exactly where to put it,, lmk if u think of a good place.





## Country Comparison Function


### Easy to Traverse-- Wide Countries

```{r}

# Wide form of the case counts: one row per country, one column per date.
WideCountries <- spread(
  subset(COVIDFinal, select = c("country", "date", "cases")),
  key = date,
  value = cases
)

# Cells with no value become zero.
WideCountries <- replace(WideCountries, is.na(WideCountries), 0)

WideCountries

```

### compareCOVID() definition

```{r}

#' Plot the COVID-19 case curves of two countries against each other.
#'
#' Reads the wide `WideCountries` table (one row per country, one column per
#' date), reshapes the two requested rows back to narrow form and draws a
#' loess-smoothed curve of case counts over time for each country.
#'
#' @param countryA Name of the first country, spelled as in
#'   `WideCountries$country`.
#' @param countryB Name of the second country, spelled the same way.
#' @return A ggplot object with date on the x axis, case count on the y axis
#'   and one coloured curve per country.
compareCOVID <- function(countryA, countryB) {
  # A name that matches no row would otherwise simply be absent from the
  # plot; warn so the caller can see why a curve is missing.
  missing_countries <- setdiff(c(countryA, countryB), WideCountries$country)
  if (length(missing_countries) > 0) {
    warning(
      "No rows in WideCountries for: ",
      paste(missing_countries, collapse = ", "),
      call. = FALSE
    )
  }

  # Wide -> narrow for one country. Excluding `country` from the gather keeps
  # the counts out of a mixed character column and avoids dropping the name
  # row by position afterwards.
  narrow_one <- function(name) {
    WideCountries %>%
      filter(country == name) %>%
      gather(key = date, value = count, -country) %>%
      mutate(
        date = lubridate::ymd(date),
        count = as.numeric(count)
      )
  }

  GG <- bind_rows(narrow_one(countryA), narrow_one(countryB))

  ggplot(GG, aes(x = date, y = count, color = country)) +
    stat_smooth(formula = y ~ x, method = "loess") +
    ylab("Number of COVID-19 Cases") +
    xlab("Date")
}




```


### Ex. of compareCOVID() in use:

```{r}

compareCOVID("China", "United States")

compareCOVID("Japan", "Russia")

compareCOVID("Puerto Rico", "Belgium")

```



